### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### Creates mobility analysis tables for model validation appendix.

library(data.table)
library(lfe)
library(ggplot2)
library(stargazer)


source('0_helper_functions.r')



Table_A9_3_Figure_A9_4 <- function() {  
  if(REPLICATION) {
    spells <- fread('../data/processed/corelogic/share/person_panel_2000_2018_spell_details.csv')
  } else {
    spells <- fread('../data/processed/corelogic/person_panel_2000_2018_spell_details.csv')
  }
  
  # Get the iBuyer prediction 
  prediction <- createiBuyerLikelihoodMeasure_2018()
  prediction[,high.ibuyer := as.integer(iBuyer.prediction.house > quantile(prediction$iBuyer.prediction.house,.5,na.rm=T))]
  
  # Get iBuyer prediction merged
  spells2 <- merge(spells,prediction,by=c('fipscode','pclidirisfrmtd'))
  names(prediction) <- c('fipscode','pclidirisfrmtd','test.next','iBuyer.prediction.house.next','ever.ibuyer.next','high.ibuyer.next')
  spells3 <- merge(spells2,prediction,by.x=c('fipscode','pclidirisfrmtd.next'),by.y=c('fipscode','pclidirisfrmtd'),all.x=T)
  spells3 <- spells3[order(fipscode,name,year)]
  
  
  
  
    qq <- spells3[,j=list(m = mean(sells,na.rm=T)),by=c('year','high.ibuyer')]
  
  
  # Impute house price, LTV, etc.
  last.yearly <- getLastYearlyTransaction()
  fipsYearPrice <- last.yearly[,j=list(fips.price=median(saleamount,na.rm=T)),by=c('fipscode','buy_year')]
  
  step.1 <- merge(spells3,fipsYearPrice,by=c('fipscode','buy_year'))
  step.1 <- merge(step.1,fipsYearPrice,by.x=c('fipscode','year'),by.y = c('fipscode','buy_year'),suffixes=c('.buy','.current'))
  
  step.1[,ltv.original := mortgageamount.this / saleamount.this]
  step.1[,ltv.this     := (mortgageamount.this * (30-tenure) / 30)  / (saleamount.this * (fips.price.current / fips.price.buy))]
  step.1[,ltv.next     := mortgageamount.next / saleamount.next]
  step.1[,ltv.next.buyer := mortgageamount.next.buyer / saleamount.next.buyer]
  step.1[,price.this   := (saleamount.this * (fips.price.current / fips.price.buy))]
  
  step.1 <- step.1[order(fipscode,name,year)]
  step.1[,post := as.integer(year >= 2015)]
  
  # Filter out outliers
  toRegSpells <- step.1[test == 1 & year >= 2008 & year <= 2018 & ltv.original < 1.1 & ltv.this < 2 & (is.na(ltv.next) | ltv.next < 1.1) & (is.na(ltv.next.buyer) | ltv.next.buyer < 1.1) & 
                          land.sqft.this < 5e4 & (is.na(land.sqft.next) | land.sqft.next < 5e4) & living.sqft.this < 5e4 & (is.na(living.sqft.next) | living.sqft.next < 5e4) & 
                          (is.na(saleamount.next) | abs(saleamount.next / price.this - 1) < 2)]
  toRegSales  <- toRegSpells[sells == 1]
  toRegSales[,stays.within := as.integer(!is.na(pclidirisfrmtd.next))]
  toRegSpells <- toRegSpells[year %in% 2013:2017]
  toRegSales <- toRegSales[year %in% 2013:2017]
  
  qq0 <- felm(I(100*sells) ~ high.ibuyer * post  | paste(fipscode,year,tenure),data=toRegSpells[test == 1])
  qq1 <- felm(I(100*(ltv.next - ltv.this)) ~ high.ibuyer * post|   paste(fipscode,year,tenure),data=toRegSales[test == 1]) # Do they sell higher LTV houses?
  qq2 <- felm(I(100*saleamount.next / price.this) ~  high.ibuyer * post |   paste(fipscode,year,tenure),data=toRegSales[test == 1]) # Do they sell higher LTV houses?
  qq3 <- felm(I(100*stays.within) ~ high.ibuyer * post | paste(fipscode,year,tenure),data=toRegSales[test == 1]) # Do they sell higher LTV houses?)
  
  qq0.h <- felm(I(100*sells) ~ high.ibuyer * post  | paste(fipscode,year,tenure),data=toRegSpells[test == 1 & ltv.this >  quantile(toRegSpells$ltv.this,na.rm=T,.75)])
  qq0.l <- felm(I(100*sells) ~ high.ibuyer * post  |  paste(fipscode,year,tenure),data=toRegSpells[test == 1 & ltv.this <=  quantile(toRegSpells$ltv.this,na.rm=T,.75)])
   
  stargazer(qq0,qq0.l,qq0.h,qq3,qq2,qq1,type='html',no.space=T,out='../out/tables/A93.html',dep.var.labels = c('Sells','Remains in market','Delta Buy Price','Delta LTV'),
            keep = c('high.ibuyer','high.ibuyer:post'),
            add.lines = list(c('Sample','All','Low LTV','High LTV','All','All','All'),
                             c('House Controls','Y','Y','Y','Y','Y','Y'),
                             c('Market x Year x Tenure FE','Y','Y','Y','Y','Y','Y')),
            omit.stat = c('f','ser','adj.rsq'))
    
   
    
  
  N = 50
  # Regressions with modeled ibuyer share
  rr0m <- bootstrapRegression(variable = 'I(100*sells)'                               ,data = toRegSpells[test == 1],type = 'model',n = N)
  rr0m.h <- bootstrapRegression(variable = 'I(100*sells)'                               ,data = toRegSpells[test == 1 & ltv.this >  quantile(toRegSpells$ltv.this,na.rm=T,.75)],type = 'model',n = N)
  rr0m.l <- bootstrapRegression(variable = 'I(100*sells)'                               ,data = toRegSpells[test == 1 & ltv.this <= quantile(toRegSpells$ltv.this,na.rm=T,.75)],type = 'model',n = N)
  rr3m <- bootstrapRegression(variable = 'I(100*stays.within)'                        ,data = toRegSales[test == 1] ,type = 'model',n = N)
  
  
  
  plotreg2 <- function(reg,ylabel,ylimits=NA) {
      ggplot(reg) + geom_point(aes(x=x,y=m)) + geom_errorbar(aes(x=x,ymax=u,ymin=l),width=.25) + geom_vline(xintercept = 2014.5,linetype = 'dotted') + geom_hline(yintercept = 0,linetype = 'dotted') + 
        ylab(ylabel) + xlab('Year') + theme_bw() + ylim(ylimits)
    
  }
  
 
  plotreg2(rr0m.h,'Probability of selling',ylimits=c(-1.2,1.2))
  ggsave('../out/figures/A94B.png',height=3.5,width=5,units = 'in')
  
  plotreg2(rr0m.l,'Probability of selling',ylimits=c(-1.2,1.2))
  ggsave('../out/figures/A94C.png',height=3.5,width=5,units = 'in')
  
  plotreg2(rr0m,'Probability of selling',ylimits=c(-.75,.75))
  ggsave('../out/figures/A94A.png',height=3.5,width=5,units = 'in')
  
  plotreg2(rr3m,'Fraction leaving city',ylimits = c(-3,2))
  ggsave('../out/figures/A94D.png',height=3.5,width=5,units = 'in')
  
  
  
}

bootstrapRegression <- function(variable,data,type='actual',n=25) {
  # Bootstrapping level is pclidirisfrmtd
  uniqueHomes <- unique(data$pclidirisfrmtd)
  for(ff in 1:n) {
    myHomes <- sample(uniqueHomes,length(uniqueHomes),replace = T)  
    toReg <- data[pclidirisfrmtd %in% myHomes] 
    if(type == 'actual') {
      expr = paste0(variable, ' ~ ever.ibuyer * as.factor(year) + log(living.sqft.this) + log(living.sqft.this) |  yearbuilt.this  + multistory.this + paste(fipscode,year,tenure)')
      r <- felm(as.formula(expr),data=toReg)
      idx <- grepl(pattern = 'ever\\.ibuyer\\:as\\.factor\\(year\\)',x = rownames(r$coefficients))
      numbers <- as.integer(str_replace(string = rownames(r$coefficients),pattern = 'ever\\.ibuyer\\:as\\.factor\\(year\\)', replacement = ''))[idx]
      m <- as.numeric(r$coefficients)[idx]
      dt <- data.table(run = ff,x = c(min(numbers)-1,numbers),y=c(0,m) )
      
    } else {
        expr = paste0(variable, ' ~ high.ibuyer * as.factor(year) | paste(fipscode,year,tenure) ')
        r <- felm(as.formula(expr),data=toReg[test == 1])
        idx <- grepl(pattern = 'high\\.ibuyer\\:as\\.factor\\(year\\)',x = rownames(r$coefficients))
        numbers <- as.integer(str_replace(string = rownames(r$coefficients),pattern = 'high\\.ibuyer\\:as\\.factor\\(year\\)', replacement = ''))[idx]
        m <- as.numeric(r$coefficients)[idx]
        dt <- data.table(run = ff,x = c(min(numbers)-1,numbers),y=c(0,m) )
    }
    if(ff == 1) {
      result = dt
    } else {
      result = rbind(result,dt)
    }
  }
  qq <- result[,j=list(m=mean(y),l = quantile(y,.025),u = quantile(y,.975)),by=x]
  return(qq)
}


createiBuyerLikelihoodMeasure_2018 <- function(trainSize = .25,regional=F) {
  # Creates iBuyer likelihood measure based on physical characteristics of the house (NOT PRICE!)
  propertyData <- loadData()[!is.na(age.bin) & !is.na(size.bin) & !is.na(multistory)]
  propertyData <- propertyData[ house.age < 100 & land_sqft < 1e5 & land_sqft > 200 & saleamount < 1.5e6 & saleamount > 1e3 & living_sqft < 10000 & living_sqft > 200 & 
                                  !is.na(median.household.income)]
  
  allHouses <- unique(propertyData$id)
  trainHouses <- sample(allHouses,size = floor(length(allHouses) * trainSize),replace = F)
  testHouses <- setdiff(allHouses,trainHouses)
  
  propertyData[,test := 1]
  propertyData[year == 2018,test := 0]
  
  # Create the `is ibuyer` variable. Use data in markets where they've entered & have reasonable market share
  propertyData[,buyer.or.seller := as.integer(iBuyer.buyer | iBuyer.seller)]
  
  ps = 4
  
  # Run the model & predict
  mm <- lm(buyer.or.seller ~ ((age.bin + size.bin + multistory + price.bin) + poly(saleamount,ps)  + poly(house.age,ps) + poly(log(land_sqft),ps) + poly(log(living_sqft),ps)) ,data=propertyData[test == 0])
  propertyData[,iBuyer.prediction.house := predict(mm,propertyData)]  
  propertyData <- propertyData[!duplicated(paste(fipscode,pclidirisfrmtd))]
  
  # Also figure out if it was EVER an iBuyer property
  propertyData[,ever.ibuyer := max(buyer.or.seller),by='pclidirisfrmtd']
  
  if(regional) {
    byZip.early <- propertyData[year %in% 2011:2014 & test == 1,j=list(propensity = mean(iBuyer.prediction.house,na.rm=T),propensity.weighted.price.early = weighted.mean(saleamount,w = iBuyer.prediction.house,na.rm=T)),by='zip5']
    byZip.late <- propertyData[year %in% 2018 ,j=list(propensity.weighted.price.late = weighted.mean(saleamount,w = iBuyer.prediction.house,na.rm=T)),by='zip5']
    byZip <- merge(byZip.early,byZip.late,by='zip5')
    
    return(byZip) 
  } else {
    return(propertyData[,c('fipscode','pclidirisfrmtd','test','iBuyer.prediction.house','ever.ibuyer'),with=F])
  }
}


getLastYearlyTransaction <- function() {
  transactions <- loadData() # Get the cleaned up corelogic transactions
  
  # Create cleaned buyername
  transactions[,buyername_cleaned := trimws(paste(owner1firstnamemi,owner1lastname))]
  transactions[,buyername_cleaned := trimws(gsub(pattern = '[^[:alnum:] ]',x = buyername_cleaned,replace = ''))]
  
  # All people who owned properties. First, filter out corporations and 
  transactions[,n := sum(absenteeownerstatus != 'Y'),by='buyername_cleaned'] # If you have a ton of absentee owner stuff we're skeptical.....
  transactions[,zip5 := substr(situszipcode,1,5)]
  transactions[,buy_year := year]
  transactions$year <- NULL
  
  # Get rid of corporations, people with transactions > 10, and guys with corporate names.
  corpNames = c('[0-9]|ARIZONA|CORP|LLC|INC|LP|SALES|CONSTRUCTION|TRUST|INC|unknown|HOMES|FNAM|DEV|JR|HUD|LTD|INS|MAE|ASSN|MARKETING|RECONVEYANCE|BK|FSB|CORPORATION|PROPERTIES|ADMN|ASSOCIATES|BANK|AMERICA|CORPS|DEVELOPMENT|AFFAIRS|INVESTMENTS|COUNTY|CORPORAT|HMS')
  transactions[,corpname := grepl(x = buyername_cleaned,pattern = corpNames,ignore.case = T)]  
  
  transactions[,last.yearly.transaction := max(date),by=c('fipscode','pclidirisfrmtd','buy_year')]
  last.yearly <- transactions[date == last.yearly.transaction ]
  
  # Get tha max sale amount . Get the biggest one.
  last.yearly[,max.sale.amount := max(saleamount),by=c('fipscode','pclidirisfrmtd','buy_year')]
  last.yearly <- last.yearly[saleamount == max.sale.amount]
  
  # Now if it's duplicated it's just bullshit. Just drop one of them.
  last.yearly <- last.yearly[!duplicated(paste(fipscode,pclidirisfrmtd,buy_year))]
  
  # Okay---each property has at most one transaction in a year. Want to get when that property is sold. This is the next thing in the transactions. Note we should have
  # the same number of properties.
  last.yearly <- last.yearly[order(fipscode,pclidirisfrmtd,buy_year)]
  last.yearly[,name := buyername_cleaned]
  
  
  return(last.yearly)
}

Table_A9_3_Figure_A9_4()

